import pandas as pd
import numpy as np
import plotly.graph_objs as go #visualization library
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf #autocorrelation test
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller #stationarity test
from statsmodels.tsa.statespace.sarimax import SARIMAX
from datetime import datetime, timedelta
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBRegressor
from scipy import stats
import holidays
import warnings
warnings.filterwarnings("ignore")
import os
import math
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
years = list(range(2005, 2016))
lst = []
for date, name in holidays.UnitedStates(years=years).items():
    print(str(date))
    lst.append(str(date))
d = pd.DataFrame(lst)
d[0] = pd.to_datetime(d[0])
data=pd.read_csv('C:\\Users\\verma\\Downloads\\data2.csv')
data2 = data  # note: this is an alias, not a copy; the 'holiday' column added below also lands on data
data2.head()
# flag holidays with 1500 rather than 1, apparently so the flag is visible on the trip-count scale when plotted below
data2['holiday'] = [1500 if str(val).split()[0] in lst else 0 for val in data2['Pickup_Date']]
data2
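# Vectorized equivalent of the comprehension above (a sketch; it assumes each
# Pickup_Date string begins with a YYYY-MM-DD date, as the split() above implies):
data2['holiday'] = data2['Pickup_Date'].astype(str).str.split().str[0].isin(lst).astype(int) * 1500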
dataset_2=pd.read_csv('C:\\Users\\verma\\Downloads\\dataset2.csv')
data.isnull().sum()
dataset_2.head()
data = data.drop(['Unnamed: 0'], axis=1)  # positional axis arguments were removed in recent pandas
data.index = data.Pickup_Date
data = data.drop(['Pickup_Date'], axis=1)
data.head()
fig = go.Figure(
data=[go.Scatter(y=data['Number of Trips'], x=data.index, name= 'Trips'),
go.Scatter(y=data['Number of Vehicles'], x=data.index, name= 'Vehicles'),
go.Scatter(y=data['holiday'], x=data.index, name= 'Holidays')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Number of Trips and vehicles")
fig.show()
data.index=pd.to_datetime(data.index , format = '%Y-%m-%d')
data.index.dtype  # confirm the index parsed as datetime64
data_monthly = data.resample('M').mean()  # monthly resampling
data_weekly = data.resample('W').mean()  # weekly resampling
data_bimonthly = data.resample('2M').mean()  # bimonthly resampling
data_yearly = data.resample('2Y').mean()  # two-year resampling (note: '2Y', not annual, despite the name)
data_yearly
data3=data['Number of Vehicles']
data3=data3.to_frame()
data3['Number of Vehicles'].replace([' - '], [110], inplace=True)  # replace the ' - ' placeholder with a numeric value before casting
set(data3['Number of Vehicles'])
data3 = data3.astype({'Number of Vehicles':int})
data3['Number of Vehicles'].dtype  # confirm the integer dtype
data3_monthly = data3.resample('M').mean()  # monthly resampling
data3_weekly = data3.resample('W').mean()  # weekly resampling
data3_bimonthly = data3.resample('2M').mean()  # bimonthly resampling
data3_monthly
fig = go.Figure(
data=[go.Scatter(y=data_monthly['Number of Trips'], x=data_monthly.index, name= 'Number of Trips'),
go.Scatter(y=data3_monthly['Number of Vehicles'],x=data3_monthly.index,name='Number of Vehicles')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
title_text="Data - Monthly"
)
)
fig.show()
fig2 = go.Figure(
data=[go.Scatter(y=data_weekly['Number of Trips'], x=data_weekly.index, name= 'Number of Trips'),
go.Scatter(y=data3_weekly['Number of Vehicles'],x=data3_weekly.index,name='Number of Vehicles')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
title_text="Data - Weekly"
)
)
fig2.show()
fig3 = go.Figure(
data=[go.Scatter(y=data_bimonthly['Number of Trips'], x=data_bimonthly.index, name= 'Number of Trips'),
go.Scatter(y=data3_bimonthly['Number of Vehicles'],x=data3_bimonthly.index,name='Number of Vehicles')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
title_text="Data - BiMonthly"
)
)
fig3.show()
sns.histplot(data_monthly['Number of Trips'], color='r', kde=True).set_title('Monthly data')  # distplot was removed in recent seaborn
sns.histplot(data_bimonthly['Number of Trips'], color='y', kde=True).set_title('Bimonthly data')
sns.histplot(data_weekly['Number of Trips'], color='b', kde=True).set_title('Weekly data')
dataset_2.columns
def test_stationarity(timeseries):
    # rolling statistics give a visual stationarity check
    rolmean = timeseries.rolling(window=30).mean()
    rolstd = timeseries.rolling(window=30).std()
    plt.figure(figsize=(14, 5))
    sns.despine(left=True)
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best'); plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    # Augmented Dickey-Fuller test (null hypothesis: unit root, i.e. non-stationary)
    print('<Results of Dickey-Fuller Test>')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4],
                         index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
test_stationarity(data['Number of Trips'].dropna())
plt.subplot(1,2,1)
data_monthly['Number of Trips'].hist(bins=50)
plt.title('Number of Trips')
plt.subplot(1,2,2)
stats.probplot(data_monthly['Number of Trips'], plot=plt);
data_monthly.describe().T
plt.subplot(1,2,1)
data_bimonthly['Number of Trips'].hist(bins=50)
plt.title('Number of Trips')
plt.subplot(1,2,2)
stats.probplot(data_bimonthly['Number of Trips'], plot=plt);
data_bimonthly.describe().T
plt.subplot(1,2,1)
data_weekly['Number of Trips'].hist(bins=50)
plt.title('Number of Trips')
plt.subplot(1,2,2)
stats.probplot(data_weekly['Number of Trips'], plot=plt);
data_weekly.describe().T
plt.subplot(1,2,1)
data['Number of Trips'].hist(bins=50)
plt.title('Number of Trips')
plt.subplot(1,2,2)
stats.probplot(data['Number of Trips'], plot=plt);
data.describe().T
test_stationarity(data_monthly['Number of Trips'].dropna())
test_stationarity(data_weekly['Number of Trips'].dropna())
decomposed = sm.tsa.seasonal_decompose(np.array(data_monthly['Number of Trips']), period=6)  # six-month (half-yearly) seasonal period; 'freq' was renamed 'period' in statsmodels
figure = decomposed.plot()
plt.show()
plot_acf(data_monthly['Number of Trips'],lags=12,title="ACF Number of Trips")
plt.show()
plot_pacf(data_monthly['Number of Trips'],lags=12,title="PACF Number of Trips")
plt.show()
# The ADF test statistic lies below the critical values, so we reject the null hypothesis of a unit root: the series looks stationary.
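# Had the test NOT rejected the unit-root null, first differencing would be the usual
# remedy before fitting ARIMA-family models (a sketch, not needed here):
# data_diff = data_monthly['Number of Trips'].diff().dropna()
# test_stationarity(data_diff)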
def sarimax_predictor(actual, order, seasonal_order, t, start, title):
    mdl = sm.tsa.statespace.SARIMAX(actual[start:],
                                    order=order, seasonal_order=seasonal_order,
                                    enforce_stationarity=False,
                                    enforce_invertibility=False)
    results = mdl.fit()
    # results.plot_diagnostics()
    print(results.summary())
    predict = results.predict(start=start, end=len(actual) + t)
    predict = predict.abs()  # trip counts cannot be negative
    fig = go.Figure(
        data=[go.Scatter(y=actual[0:-t], x=actual[0:-t].index, name='Actual'),
              go.Scatter(y=actual[-t-1:], x=actual[-t-1:].index, name='Test'),
              go.Scatter(y=predict, x=predict.index, name='Predict')],
        layout=go.Layout(
            xaxis=dict(showgrid=False),
            yaxis=dict(showgrid=False),
        )
    )
    fig.update_layout(title_text=title)
    fig.show()
    return predict
# commented-out variant for when exogenous variables are needed (a live version is defined further below)
# def sarimax_predictor_exog(actual, order, seasonal_order, t, title, exog):
# mdl = sm.tsa.statespace.SARIMAX(actual[0:-t],
# order=order, seasonal_order=seasonal_order, exog = exog[0:-t],
# enforce_stationarity=False,
# enforce_invertibility=False, time_varying_regression = False,
# mle_regression = True)
# results = mdl.fit()
# results.plot_diagnostics()
# print(results.summary())
# #use only exogenous to forecasting (test set)
# predict = results.predict(start=0, end=len(actual), exog=exog[-t-1::])
# fig = go.Figure(
# data=[go.Scatter(y=actual[0:-t],x=actual[0:-t].index, name= 'Actual'),
# go.Scatter(y=actual[-t-1::],x=actual[-t-1::].index, name= 'Test'),
# go.Scatter(y=predict, x=predict.index, name= 'Predict')],
# layout=go.Layout(
# xaxis=dict(showgrid=False),
# yaxis=dict(showgrid=False),
# )
# )
# fig.update_layout(title_text= title)
# fig.show()
# return predict
# evaluation helper
def rmse(actual, predict, title):
    value = np.sqrt(metrics.mean_squared_error(actual, predict))
    print('The RMSE of ' + title + ' is:', value)
    return value
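# Companion scale-independent metric (a sketch; it assumes the actual series contains no zeros):
def mape(actual, predict, title):
    value = np.mean(np.abs((np.array(actual) - np.array(predict)) / np.array(actual))) * 100
    print('The MAPE of ' + title + ' is:', value)
    return value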
start = 0
predicted_results = sarimax_predictor(data_monthly['Number of Trips'], [5,1,3], [1,1,0,26], 12, 1,
'Monthly forecast - Number of trips')
# print(rmse(data_monthly,predicted_results[:132],'checking'))
start = 0
predicted_results2 = sarimax_predictor(data_weekly['Number of Trips'], [1,1,1], [1,1,0,24], 7*2, start,
'Weekly forecast - Number of trips')
#print(rmse(data_weekly,predicted_results2[:575],'checking'))
predicted_result_foods_bimonthly = sarimax_predictor(data_bimonthly['Number of Trips'], [8,1,0], [1,0,0,6], 5, 1,
'Bimonthly forecast')
# print(rmse(data_bimonthly,predicted_result_foods_bimonthly[:67],'checking'))
d = data.iloc[3800:4017, 0:1]  # .ix was removed from pandas; iloc does the same positional slicing
d
predicted_result_daily = sarimax_predictor(d['Number of Trips'], [5,1,4], [1,1,0,21], 50, 0,
'Daily forecast')
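# The (p,d,q)(P,D,Q,s) orders above were picked manually. A small AIC grid search is
# one way such orders could be chosen (a sketch; the 3x3 search space is illustrative):
import itertools
best_aic, best_order = np.inf, None
for p, q in itertools.product(range(3), range(3)):
    try:
        res = SARIMAX(data_monthly['Number of Trips'], order=(p, 1, q),
                      seasonal_order=(1, 1, 0, 12)).fit(disp=False)
        if res.aic < best_aic:
            best_aic, best_order = res.aic, (p, 1, q)
    except Exception:
        continue
print('best order by AIC:', best_order, best_aic)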
data2.index=data2['Unnamed: 0']
data2['Number of Vehicles'].dtype
#data2['Number of Vehicles'] = data2['Number of Vehicles'].map({' - ': 12})
set(data2['Number of Vehicles'])
data2['Number of Vehicles'].replace([' - '], [110], inplace=True)  # replace the ' - ' placeholder before casting
data2 = data2.astype({'Number of Vehicles': int})
data2['Number of Vehicles'].dtype
data2 = data2.drop('Pickup_Date', axis=1)
data2
# x_train=x_train.to_frame()
x_train, x_test, y_train, y_test = train_test_split(data2, data2['Number of Vehicles'], test_size=0.3, shuffle=False)  # shuffle=False preserves the time ordering
y_train=y_train.to_frame()
print(type(x_train))
x_train.head()
x_train = x_train.drop('Number of Vehicles', axis=1)
data2
x_test = x_test.drop('Number of Vehicles', axis=1)
num_folds = 10
seed = 0
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=num_folds)  # random_state only takes effect with shuffle=True; folds are left unshuffled to preserve time order
model = XGBRegressor(n_estimators=70 , learning_rate = .1)
score_= cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
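# cross_val_score with scoring='neg_mean_squared_error' returns negative MSEs;
# flip the sign and take the square root to read them as per-fold RMSEs:
print('CV RMSE per fold:', np.sqrt(-score_))
print('Mean CV RMSE:', np.sqrt(-score_).mean())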
model.fit(x_train, y_train)
prediction=model.predict(x_train)
predictions = model.predict(x_test)
print(metrics.r2_score(y_test, predictions))
rmse_xgb = np.sqrt(metrics.mean_squared_error(y_test, predictions))  # renamed so the rmse() helper above is not shadowed
print('Root Mean Square Error', rmse_xgb)
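# Inspect which inputs drive the XGBoost model (x_train keeps its column names):
for name, importance in zip(x_train.columns, model.feature_importances_):
    print(name, importance)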
data2[['Number of Trips','Number of Vehicles','holiday']].corr(method='pearson')
reg=LinearRegression()
reg.fit(x_train,y_train)
a=reg.predict(x_test)
data2
cols=['Unnamed: 0','Number of Trips','holiday']
b=reg.predict(data2[cols])
b
c=model.predict(data2[cols])
b = b.ravel()  # flatten the (n, 1) predictions that come from fitting on a DataFrame target
c.shape
fig = go.Figure(
data=[go.Scatter(y=data2['Number of Vehicles'],x=data2.index, name= 'Actual'),
go.Scatter(y=b,x=data2.index, name= 'sklearn'),
go.Scatter(y=c,x=data2.index, name= 'xgboost')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.show()
# Part 1 is nearly complete; it remains to aggregate some of the predicted data.
# Still to do: MAPA combination, LSTM, then forecasting of latitudes and longitudes, and GeoPandas maps.
data2.index=data.index
data2
predicted_results
def alpha(actual, predict):
    # search for the scalar multiplier of the forecast that minimises RMSE against the actuals
    RMSE = []
    for i in np.arange(0.5, 15, 0.01):
        RMSE.append([i, np.sqrt(metrics.mean_squared_error(actual, predict * i))])
    return np.array(RMSE)[np.argmin(np.array(RMSE)[:, 1]), 0]
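# Quick sanity check of alpha(): if the forecast is exactly half of the actuals,
# the recovered multiplier should be ~2.0 (a toy example):
print(alpha(np.array([10.0, 20.0, 30.0]), np.array([5.0, 10.0, 15.0])))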
#step of combination
predicted_result_foods_bimonthly1 = predicted_result_foods_bimonthly.resample('W').mean()
predicted_results1 = predicted_results.resample('W').mean()
# spread each low-frequency value equally across the weekly slots it covers
predictions_foods = pd.DataFrame({'bimonthly': predicted_result_foods_bimonthly1.groupby(predicted_result_foods_bimonthly1.notnull().cumsum()).transform(lambda x : x.sum()/len(x)),
'monthly': predicted_results1.groupby(predicted_results1.notnull().cumsum()).transform(lambda x : x.sum()/len(x)),
'weekly': predicted_results2[1::]})
prediction_foods_mean = pd.DataFrame.mean(predictions_foods, axis = 1)
prediction_foods_median = pd.DataFrame.median(predictions_foods, axis = 1)
prediction_foods_min = pd.DataFrame.min(predictions_foods, axis = 1)
prediction_foods_max = pd.DataFrame.max(predictions_foods, axis = 1)
alpha_foods_max = alpha(data_weekly['Number of Trips'][start+1:-28], prediction_foods_max[start+1:-3*28-8])
alpha_foods_min = alpha(data_weekly['Number of Trips'][start+1:-28], prediction_foods_min[start+1:-3*28-8])
alpha_foods_mean = alpha(data_weekly['Number of Trips'][start+1:-28], prediction_foods_mean[start+1:-3*28-8])
alpha_median = alpha(data_weekly['Number of Trips'][start+1:-28], prediction_foods_median[start+1:-3*28-8])
fig = go.Figure(
data=[go.Scatter(y=data_weekly['Number of Trips'], x = data_weekly.index, name= 'Actual'),
go.Scatter(y=prediction_foods_min[0:-1]*alpha_foods_min, x= data_weekly.index, name= 'Predict Min'),
go.Scatter(y=prediction_foods_max[0:-1]*alpha_foods_max, x= data_weekly.index, name= 'Predict Max'),
go.Scatter(y=prediction_foods_mean[0:-1]*alpha_foods_mean, x= data_weekly.index, name= 'Predict Mean'),
go.Scatter(y=prediction_foods_median[0:-1]*alpha_median, x= data_weekly.index, name= 'Predict median')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Foods Category - MAPA SARIMA forecast")
fig.show()
# LSTM from here
data_monthly
data
dataset = data['Number of Trips'].values #numpy.ndarray
dataset = dataset.astype('float32')
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
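# Note: the scaler above was fit on the full series, so information from the test
# window leaks into the scaling. A leakage-free variant (a sketch, not applied here
# so the original pipeline is unchanged) would fit on the training split only:
# scaler = MinMaxScaler(feature_range=(0, 1))
# train = scaler.fit_transform(train)
# test = scaler.transform(test)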
def create_dataset(dataset, look_back=10):
    # slide a window of length look_back over the series; each window predicts the next value
    X, Y = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)
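# Shape check on a toy series: with look_back=2, the column vector [0..5] yields
# windows X = [[0,1],[1,2],[2,3]] and next-step targets Y = [2,3,4]:
_X, _Y = create_dataset(np.arange(6, dtype='float32').reshape(-1, 1), look_back=2)
print(_X, _Y)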
look_back = 30
X_train, Y_train = create_dataset(train, look_back)
X_test, Y_test = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
data['Number of Trips'].shape
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.7))
model.add(LSTM(80))
model.add(Dropout(0.7))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit(X_train, Y_train, epochs=10, batch_size=40, validation_data=(X_test, Y_test),
callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=1, shuffle=False)
model.summary()
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# invert predictions
train_predict = scaler.inverse_transform(train_predict)
Y_train = scaler.inverse_transform([Y_train])
test_predict = scaler.inverse_transform(test_predict)
Y_test = scaler.inverse_transform([Y_test])
print('Train Mean Absolute Error:', mean_absolute_error(Y_train[0], train_predict[:,0]))
print('Train Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_train[0], train_predict[:,0])))
print('Test Mean Absolute Error:', mean_absolute_error(Y_test[0], test_predict[:,0]))
print('Test Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_test[0], test_predict[:,0])))
plt.figure(figsize=(8,4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Test Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(loc='upper right')
plt.show()
aa = list(range(200))
plt.figure(figsize=(8,4))
plt.plot(aa, Y_train[0][:200], marker='.', label="actual")
plt.plot(aa, train_predict[:,0][:200], 'r', label="prediction")
# plt.tick_params(left=False, labelleft=True) #remove ticks
plt.tight_layout()
sns.despine(top=True)
plt.ylabel('Number of Trips', size=15)
plt.xlabel('Time step', size=15)
plt.legend(loc='best',fontsize=10)
plt.title("Training Data Predictions")
plt.show()
aa = list(range(200))
plt.figure(figsize=(8,4))
plt.plot(aa, Y_test[0][:200], marker='.', label="actual")
plt.plot(aa, test_predict[:,0][:200], 'r', label="prediction")
# plt.tick_params(left=False, labelleft=True) #remove ticks
plt.tight_layout()
sns.despine(top=True)
plt.ylabel('Number of Trips', size=15)
plt.xlabel('Time step', size=15)
plt.legend(loc='best',fontsize=15)
plt.title('Test Data Predictions')
plt.show()
dataset_2.drop('Unnamed: 0', axis=1, inplace=True)
dataset_2
plt.scatter('distance_travelled', 'start_location_lat', data=dataset_2, label='start_location_lat')  # label so the legend below has content
#plt.scatter('distance_travelled','start_location_long',data=dataset_2)
plt.legend(loc='best')
plt.xlabel('Distance travelled')
plt.ylabel('Degrees')
plt.title('Data Visualization')
plt.scatter('distance_travelled', 'start_location_long', data=dataset_2, color='r', label='start_location_long')
plt.legend(loc='best')
plt.xlabel('Distance travelled')
plt.ylabel('Degrees')
plt.title('Data Visualization')
fig = go.Figure(
data=[go.Scatter(y=dataset_2['start_location_lat'], x=dataset_2.completed_on, name= 'Latitude')
],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Pickup Location ")
fig.show()
fig = go.Figure(
data=[
go.Scatter(y=dataset_2['start_location_long'], x=dataset_2.completed_on, name= 'Longitude'),],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Pickup Location ")
fig.show()
fig = go.Figure(
data=[go.Scatter(y=dataset_2['distance_travelled'], x=dataset_2.completed_on, name= 'Distance')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Distance Travelled ")
fig.show()
dataset_2.index = dataset_2.completed_on
dataset_2.drop('completed_on', axis=1, inplace=True)
dataset_2.index = pd.to_datetime(dataset_2.index)
dataset_2.isnull().sum()
dataset_2['distance_travelled'] = dataset_2['distance_travelled'].ffill()  # forward-fill missing distances; fillna(method=...) is deprecated
data_monthly_2 = dataset_2.resample('M').mean()  # monthly resampling
data_weekly_2 = dataset_2.resample('W').mean()  # weekly resampling
data_bimonthly_2 = dataset_2.resample('2M').mean()  # bimonthly resampling
data_weekly_2
sns.histplot(data_monthly_2.distance_travelled, color='r', kde=True).set_title('Distance Travelled')
sns.histplot(data_bimonthly_2.distance_travelled, color='b', kde=True).set_title('Distance Travelled')
sns.histplot(data_weekly_2.distance_travelled, color='y', kde=True).set_title('Distance Travelled')
sns.histplot(data_monthly_2.start_location_long, color='r', kde=True).set_title('Start Location Longitude Monthly')
sns.histplot(data_bimonthly_2.start_location_long, color='b', kde=True).set_title('Start Location Longitude Bimonthly')
sns.histplot(data_weekly_2.start_location_long, color='y', kde=True).set_title('Start Location Longitude Weekly')
fig = go.Figure(
data=[go.Scatter(y=data_weekly_2['distance_travelled'], x=data_weekly_2.index, name= 'Distance')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Distance Travelled ")
fig.show()
fig = go.Figure(
data=[go.Scatter(y=data_monthly_2['distance_travelled'], x=data_monthly_2.index, name= 'Distance')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Distance Travelled ")
fig.show()
fig = go.Figure(
data=[go.Scatter(y=data_bimonthly_2['distance_travelled'], x=data_bimonthly_2.index, name= 'Distance')],
layout=go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False),
)
)
fig.update_layout(title_text="Distance Travelled ")
fig.show()
# function for when we need exogenous variables
def sarimax_predictor_exog(actual, order, seasonal_order, t, title, exog):
    mdl = sm.tsa.statespace.SARIMAX(actual[:],
                                    order=order, seasonal_order=seasonal_order, exog=exog[:],
                                    enforce_stationarity=False,
                                    enforce_invertibility=False, time_varying_regression=False,
                                    mle_regression=True)
    results = mdl.fit()
    # results.plot_diagnostics()
    print(results.summary())
    # in-sample prediction plus one step ahead; the single extra step needs one exog row
    predict = results.predict(start=0, end=len(actual), exog=exog[0:1])
    fig = go.Figure(
        data=[go.Scatter(y=actual[0:-t], x=actual[0:-t].index, name='Actual'),
              go.Scatter(y=actual[-t-1:], x=actual[-t-1:].index, name='Test'),
              go.Scatter(y=predict[2:], x=predict.index[2:], name='Predict')],
        layout=go.Layout(
            xaxis=dict(showgrid=False),
            yaxis=dict(showgrid=False),
        )
    )
    fig.update_layout(title_text=title)
    fig.show()
    return predict
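# For genuine out-of-sample forecasting with exogenous regressors, statsmodels wants
# one row of *future* exog values per forecast step (a sketch; exog_future is a
# hypothetical frame of future regressor values, not built in this notebook):
# forecast = results.get_forecast(steps=t, exog=exog_future)
# mean_forecast = forecast.predicted_mean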
exog=dataset_2['distance_travelled']
exog.index=dataset_2.index
print(exog.index.dtype)
print(exog.head())
# cols=['start_location_lat','start_location_long']
# exog1=dataset_2[cols]
# exog1.index=dataset_2.index
# print(exog1.head())
test_stationarity(dataset_2['distance_travelled'].dropna())
test_stationarity(dataset_2['start_location_lat'].dropna())
test_stationarity(dataset_2['start_location_long'].dropna())
plt.subplot(1,2,1)
data_monthly_2['distance_travelled'].hist(bins=50)
plt.title('distance_travelled Monthly')
plt.subplot(1,2,2)
stats.probplot(data_monthly_2['distance_travelled'], plot=plt);
data_monthly_2.describe().T
plt.tight_layout()
plt.subplot(1,2,1)
data_bimonthly_2['distance_travelled'].hist(bins=50)
plt.title('distance_travelled Bimonthly')
plt.subplot(1,2,2)
stats.probplot(data_bimonthly_2['distance_travelled'], plot=plt);
data_bimonthly_2.describe().T
plt.tight_layout()
plt.subplot(1,2,1)
data_weekly_2['distance_travelled'].hist(bins=50)
plt.title('distance_travelled Weekly')
plt.subplot(1,2,2)
stats.probplot(data_weekly_2['distance_travelled'], plot=plt);
data_weekly_2.describe().T
plt.tight_layout()
plt.subplot(1,2,1)
dataset_2['distance_travelled'].hist(bins=50)
plt.title('distance_travelled Daily ')
plt.subplot(1,2,2)
stats.probplot(dataset_2['distance_travelled'], plot=plt);
dataset_2.describe().T
plt.tight_layout()
decomposed = sm.tsa.seasonal_decompose(np.array(data_weekly_2['distance_travelled']), period=6)  # six-period seasonal cycle; 'freq' was renamed 'period' in statsmodels
figure = decomposed.plot()
decomposed = sm.tsa.seasonal_decompose(np.array(data_weekly_2['start_location_lat']), period=6)
figure = decomposed.plot()
decomposed = sm.tsa.seasonal_decompose(np.array(data_weekly_2['start_location_long']), period=6)
figure = decomposed.plot()
plt.show()
plot_acf(data_weekly_2['distance_travelled'], lags=12, title="ACF Distance Travelled")
plt.show()
plot_pacf(data_weekly_2['distance_travelled'], lags=12, title="PACF Distance Travelled")
plt.show()
cols=['start_location_lat','start_location_long']
exog1=dataset_2[cols]
exog_w=data_weekly_2[cols]
exog_m=data_monthly_2[cols]
exog_b=data_bimonthly_2[cols]
predicted_result = sarimax_predictor_exog(dataset_2.distance_travelled, [1,0,1], [2,1,0,7], 28*7, 'Daily forecast - Distance travelled', exog1)
predicted_result = sarimax_predictor_exog(data_weekly_2.distance_travelled, [4,0,2], [1,0,2,2], 6*1, 'Weekly forecast - Distance travelled', exog_w)
predicted_result = sarimax_predictor_exog(data_monthly_2.distance_travelled, [2,0,1], [3,0,0,1], 6*1, 'Monthly forecast - Distance travelled', exog_m)
predicted_result = sarimax_predictor_exog(data_bimonthly_2.distance_travelled, [1,0,0], [1,0,0,1], 2*1, 'Bimonthly forecast - Distance travelled', exog_b)
exog_2=dataset_2['distance_travelled']
exog_2_w=data_weekly_2['distance_travelled']
exog_2_m=data_monthly_2['distance_travelled']
exog_2_b=data_bimonthly_2['distance_travelled']
exog_2=exog_2.to_frame()
exog_2_w=exog_2_w.to_frame()
exog_2_m=exog_2_m.to_frame()
exog_2_b=exog_2_b.to_frame()
exog_2.ffill(inplace=True)  # fillna(method=...) is deprecated
def sarimax_predictor_exogs(actual, order, seasonal_order, t, title, exog):
    # defined before its first use (the original script called this function before defining it)
    mdl = sm.tsa.statespace.SARIMAX(actual[:],
                                    order=order, seasonal_order=seasonal_order, exog=exog[:],
                                    enforce_stationarity=False,
                                    enforce_invertibility=False, time_varying_regression=False,
                                    mle_regression=True)
    results = mdl.fit()
    # results.plot_diagnostics()
    print(results.summary())
    # in-sample prediction plus one step ahead; the single extra step needs one exog row
    predict = results.predict(start=0, end=len(actual), exog=exog[0:1])
    # crude post-processing: keep only the fractional part and pin predictions to the ~30-degree latitude band
    for i in range(len(predict)):
        dec = predict[i] - int(predict[i])
        predict[i] = 30 + dec
    fig = go.Figure(
        data=[go.Scatter(y=actual[0:-t], x=actual[0:-t].index, name='Actual'),
              go.Scatter(y=actual[-t-1:], x=actual[-t-1:].index, name='Test'),
              go.Scatter(y=predict[2:], x=predict.index[2:], name='Predict')],
        layout=go.Layout(
            xaxis=dict(showgrid=False),
            yaxis=dict(showgrid=False),
        )
    )
    fig.update_layout(title_text=title)
    fig.show()
    return predict
predicted_result = sarimax_predictor_exogs(dataset_2.start_location_lat, [1,1,0], [0,0,0,1], 3*1, 'Daily forecast - Latitude', exog_2)
predicted_result = sarimax_predictor_exogs(data_weekly_2.start_location_lat, [1,1,0], [0,0,0,1], 3*1, 'Weekly forecast - Latitude', exog_2_w)
predicted_result = sarimax_predictor_exogs(data_monthly_2.start_location_lat, [1,1,0], [0,0,0,1], 3*1, 'Monthly forecast - Latitude', exog_2_m)
predicted_result = sarimax_predictor_exog(dataset_2.start_location_long, [1,1,0], [0,0,0,1], 3*1, 'Daily forecast - Longitude', exog_2)
# data_weekly_2.iloc[0,3]=-97.471979
# data_weekly_2
predicted_result = sarimax_predictor_exog(data_weekly_2.start_location_long, [1,1,0], [0,0,0,1], 3*1, 'Weekly forecast - Longitude', exog_2_w)
predicted_result = sarimax_predictor_exog(data_monthly_2.start_location_long, [1,1,0], [0,0,0,1], 3*1, 'Monthly forecast - Longitude', exog_2_m)
dataset_2
dataset = dataset_2['start_location_lat'].values #numpy.ndarray
dataset = dataset.astype('float32')
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# create_dataset() was defined earlier; the same windowing helper is reused here
look_back = 30
X_train, Y_train = create_dataset(train, look_back)
X_test, Y_test = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
dataset_2['start_location_lat'].shape
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(30))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit(X_train, Y_train, epochs=5, batch_size=20, validation_data=(X_test, Y_test),
callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=1, shuffle=False)
model.summary()
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# invert predictions
train_predict = scaler.inverse_transform(train_predict)
Y_train = scaler.inverse_transform([Y_train])
test_predict = scaler.inverse_transform(test_predict)
Y_test = scaler.inverse_transform([Y_test])
print('Train Mean Absolute Error:', mean_absolute_error(Y_train[0], train_predict[:,0]))
print('Train Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_train[0], train_predict[:,0])))
print('Test Mean Absolute Error:', mean_absolute_error(Y_test[0], test_predict[:,0]))
print('Test Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_test[0], test_predict[:,0])))
plt.figure(figsize=(8,4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Test Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(loc='upper right')
plt.show()
aa = list(range(200))
plt.figure(figsize=(8,4))
plt.plot(aa, Y_test[0][:200], marker='.', label="actual")
plt.plot(aa, test_predict[:,0][:200], 'r', label="prediction")
# plt.tick_params(left=False, labelleft=True) #remove ticks
plt.tight_layout()
sns.despine(top=True)
plt.ylabel('start_location_lat', size=15)
plt.xlabel('Time step', size=15)
plt.legend(loc='best',fontsize=15)
plt.title('Test Data Predictions')
plt.show()
dataset = dataset_2['start_location_long'].values #numpy.ndarray
dataset = dataset.astype('float32')
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# create_dataset() was defined earlier; the same windowing helper is reused here
look_back = 30
X_train, Y_train = create_dataset(train, look_back)
X_test, Y_test = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
dataset_2['start_location_long'].shape  # length check (the original inspected start_location_lat here, apparently a copy-paste slip)
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(30))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit(X_train, Y_train, epochs=5, batch_size=20, validation_data=(X_test, Y_test),
callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=1, shuffle=False)
model.summary()
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# invert predictions
train_predict = scaler.inverse_transform(train_predict)
Y_train = scaler.inverse_transform([Y_train])
test_predict = scaler.inverse_transform(test_predict)
Y_test = scaler.inverse_transform([Y_test])
print('Train Mean Absolute Error:', mean_absolute_error(Y_train[0], train_predict[:,0]))
print('Train Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_train[0], train_predict[:,0])))
print('Test Mean Absolute Error:', mean_absolute_error(Y_test[0], test_predict[:,0]))
print('Test Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_test[0], test_predict[:,0])))
plt.figure(figsize=(8,4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Test Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(loc='upper right')
plt.show()
aa = list(range(200))
plt.figure(figsize=(8,4))
plt.plot(aa, Y_test[0][:200], marker='.', label="actual")
plt.plot(aa, test_predict[:,0][:200], 'r', label="prediction")
# plt.tick_params(left=False, labelleft=True) #remove ticks
plt.tight_layout()
sns.despine(top=True)
plt.ylabel('start_location_long', size=15)
plt.xlabel('Time step', size=15)
plt.legend(loc='best',fontsize=15)
plt.title('Test Data Predictions')
plt.show()
cols=['distance_travelled','end_location_lat','end_location_long','start_location_long','start_location_lat']
print(dataset_2.head())
data_raw = dataset_2.values.astype("float32")
#data_raw=data_raw.reshape(-1,1)
scaler = MinMaxScaler(feature_range = (0, 1))
data_raw = scaler.fit_transform(data_raw)
data_raw
dataset_3=pd.DataFrame(data_raw)
dataset_3.index=dataset_2.index
dataset_3.index.dtype
dataset_3.columns=cols
dataset_3.head()  # note: this jointly scaled frame is not used in the modelling below
dataset = dataset_2['distance_travelled'].values #numpy.ndarray
dataset = dataset.astype('float32')
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# create_dataset() was defined earlier; the same windowing helper is reused here
look_back = 30
X_train, Y_train = create_dataset(train, look_back)
X_test, Y_test = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
dataset_2['distance_travelled'].shape
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(30))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit(X_train, Y_train, epochs=5, batch_size=200, validation_data=(X_test, Y_test),
callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=1, shuffle=False)
model.summary()
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# invert predictions
train_predict = scaler.inverse_transform(train_predict)
Y_train = scaler.inverse_transform([Y_train])
test_predict = scaler.inverse_transform(test_predict)
Y_test = scaler.inverse_transform([Y_test])
print('Train Mean Absolute Error:', mean_absolute_error(Y_train[0], train_predict[:,0]))
print('Train Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_train[0], train_predict[:,0])))
print('Test Mean Absolute Error:', mean_absolute_error(Y_test[0], test_predict[:,0]))
print('Test Root Mean Squared Error:',np.sqrt(mean_squared_error(Y_test[0], test_predict[:,0])))
plt.figure(figsize=(8,4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Test Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(loc='upper right')
plt.show()
aa = list(range(200))
plt.figure(figsize=(8,4))
plt.plot(aa, Y_test[0][:200], marker='.', label="actual")
plt.plot(aa, test_predict[:,0][:200], 'r', label="prediction")
# plt.tick_params(left=False, labelleft=True) #remove ticks
plt.tight_layout()
sns.despine(top=True)
plt.ylabel('Distance Travelled', size=15)
plt.xlabel('Time step', size=15)
plt.legend(loc='best',fontsize=15)
plt.title('Test Data Predictions')
plt.show()
# For the first dataset, both the SARIMA and LSTM approaches performed well.
# For the second dataset, the LSTM model performed poorly on all three series, while the SARIMA model held up well.
import geopandas as gpd
from shapely.geometry import Point,Polygon
import mplleaflet
import descartes
c=['start_location_lat','start_location_long']
points=dataset_2[c]
type(points)
gdf = gpd.GeoDataFrame(
points, geometry=gpd.points_from_xy(points.start_location_long, points.start_location_lat))
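# Declaring the CRS documents that these are WGS84 lon/lat degrees (an assumption
# about the source data, though consistent with the Austin-area coordinates below):
gdf = gdf.set_crs(epsg=4326)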
gdf = gdf.iloc[1:100]  # keep a manageable subset of points for plotting
gdf.plot()
mplleaflet.show()
ax = gdf.plot(markersize = 50, color = "red")
# convert the matplotlib plot to a web map:
mplleaflet.display(fig=ax.figure, crs=gdf.crs)
import folium
m = folium.Map(location=[30.25, -97.75], zoom_start=10, control_scale=True)
m
m2 = folium.Map(location=[30.25, -97.75], tiles='Stamen Toner',
zoom_start=12, control_scale=True, prefer_canvas=True)
m2
folium.Marker(
location=[30.25435, -97.75543],
popup=[30.25435, -97.75543],
icon=folium.Icon(color='green', icon='ok-sign'),
).add_to(m)
#Show map
m
# Get x and y coordinates for each point
gdf["x"] = gdf["geometry"].apply(lambda geom: geom.x)
gdf["y"] = gdf["geometry"].apply(lambda geom: geom.y)
# Create a list of coordinate pairs
locations = list(zip(gdf["y"], gdf["x"]))
len(locations)
from folium.plugins import HeatMap
# Create a Map instance
m = folium.Map(location=[30.25, -97.75], tiles = 'stamentoner', zoom_start=10, control_scale=True)
# Add heatmap to map instance
# Available parameters: HeatMap(data, name=None, min_opacity=0.5, max_zoom=18, max_val=1.0, radius=25, blur=15, gradient=None, overlay=True, control=True, show=True)
HeatMap(locations).add_to(m)
# Alternative syntax:
#m.add_child(HeatMap(points_array, radius=15))
# Show map
m
from folium.plugins import MarkerCluster
marker_cluster = MarkerCluster(locations)
# Add marker cluster to map
marker_cluster.add_to(m)
# Show map
m